Introduction: This is an exploratory data analysis of the Beers.csv and Breweries.csv data sets, prepared for the CEO and CFO of Budweiser.
We begin with an initial inspection of the data through the various methods below.
# dataset should be in the same folder of this RMD file
# Load the two datasets. Per the note above, the CSV files live in the same
# folder as this RMD file, so use relative paths instead of a hard-coded
# absolute path -- this keeps the analysis reproducible on any machine.
Beers = read.csv("Beers.csv", header = TRUE)         # beer-level data (name, ABV, IBU, style, ounces)
Breweries = read.csv("Breweries.csv", header = TRUE) # brewery-level data (name, city, state)
#below this line is for self analyzation
#summary(Beers)
#str(Beers)
##Beers$IBU
#summary(Breweries)
#str(Breweries)
#Above this line is for self analyzing can be deleted later
# Convert the State column of Breweries from character to factor
Breweries$State <- as.factor(Breweries$State)
library(tidyverse)
## ── Attaching packages ──────────────────────────────────────────────────────────────────────────────────────── tidyverse 1.3.0 ──
## ✓ ggplot2 3.3.2 ✓ purrr 0.3.4
## ✓ tibble 3.0.3 ✓ dplyr 1.0.0
## ✓ tidyr 1.1.0 ✓ stringr 1.4.0
## ✓ readr 1.3.1 ✓ forcats 0.5.0
## ── Conflicts ─────────────────────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(ggplot2)
# Tally how many breweries each state has
num_Breweries_by_state <- Breweries %>%
  group_by(State) %>%
  summarise(count = n())
num_Breweries_by_state
## # A tibble: 51 x 2
## State count
## <fct> <int>
## 1 " AK" 7
## 2 " AL" 3
## 3 " AR" 2
## 4 " AZ" 11
## 5 " CA" 39
## 6 " CO" 47
## 7 " CT" 8
## 8 " DC" 1
## 9 " DE" 2
## 10 " FL" 15
## # … with 41 more rows
# The number of breweries per state is easier to compare in a bar chart.
# geom_col() is the direct equivalent of geom_bar(stat = "identity").
ggplot(data = num_Breweries_by_state) +
  geom_col(mapping = aes(x = State, y = count, fill = State)) +
  ggtitle("Breweries count by state") +
  xlab("State") +
  ylab("Count of Breweries")
# Same chart, with states ordered by descending brewery count
ggplot(data = num_Breweries_by_state) +
  geom_col(mapping = aes(x = reorder(State, -count), y = count, fill = State)) +
  ggtitle("Breweries count by state") +
  xlab("State") +
  ylab("Count of Breweries")
# Align the brewery-ID column names, join beers to breweries, and
# disambiguate the two Name columns produced by the join.
Beers <- Beers %>% rename(Brew_ID = Brewery_id)
Beers.with.Breweries <- left_join(Beers, Breweries, by = "Brew_ID") %>%
  rename(Beer_Name = Name.x, Brew_Name = Name.y)
head(Beers.with.Breweries, 6)
## Beer_Name Beer_ID ABV IBU Brew_ID Style
## 1 Pub Beer 1436 0.050 NA 409 American Pale Lager
## 2 Devil's Cup 2265 0.066 NA 178 American Pale Ale (APA)
## 3 Rise of the Phoenix 2264 0.071 NA 178 American IPA
## 4 Sinister 2263 0.090 NA 178 American Double / Imperial IPA
## 5 Sex and Candy 2262 0.075 NA 178 American IPA
## 6 Black Exodus 2261 0.077 NA 178 Oatmeal Stout
## Ounces Brew_Name City State
## 1 12 10 Barrel Brewing Company Bend OR
## 2 12 18th Street Brewery Gary IN
## 3 12 18th Street Brewery Gary IN
## 4 12 18th Street Brewery Gary IN
## 5 12 18th Street Brewery Gary IN
## 6 12 18th Street Brewery Gary IN
# Summary statistics of the merged data (note the NA counts: 62 for ABV, 1005 for IBU)
summary(Beers.with.Breweries)
## Beer_Name Beer_ID ABV IBU
## Length:2410 Min. : 1.0 Min. :0.00100 Min. : 4.00
## Class :character 1st Qu.: 808.2 1st Qu.:0.05000 1st Qu.: 21.00
## Mode :character Median :1453.5 Median :0.05600 Median : 35.00
## Mean :1431.1 Mean :0.05977 Mean : 42.71
## 3rd Qu.:2075.8 3rd Qu.:0.06700 3rd Qu.: 64.00
## Max. :2692.0 Max. :0.12800 Max. :138.00
## NA's :62 NA's :1005
## Brew_ID Style Ounces Brew_Name
## Min. : 1.0 Length:2410 Min. : 8.40 Length:2410
## 1st Qu.: 94.0 Class :character 1st Qu.:12.00 Class :character
## Median :206.0 Mode :character Median :12.00 Mode :character
## Mean :232.7 Mean :13.59
## 3rd Qu.:367.0 3rd Qu.:16.00
## Max. :558.0 Max. :32.00
##
## City State
## Length:2410 CO : 265
## Class :character CA : 183
## Mode :character MI : 162
## IN : 139
## TX : 130
## OR : 125
## (Other):1406
library(mice) #Load mice library to analyze the pattern of missing data
##
## Attaching package: 'mice'
## The following objects are masked from 'package:base':
##
## cbind, rbind
# Show the missing-data pattern: rows complete, missing IBU only, or missing both ABV and IBU
md.pattern(Beers.with.Breweries)
## Beer_Name Beer_ID Brew_ID Style Ounces Brew_Name City State ABV IBU
## 1405 1 1 1 1 1 1 1 1 1 1 0
## 943 1 1 1 1 1 1 1 1 1 0 1
## 62 1 1 1 1 1 1 1 1 0 0 2
## 0 0 0 0 0 0 0 0 62 1005 1067
# Since there is a large amount of data missing in the IBU column (1005 rows),
# impute the missing values with the Predictive Mean Matching (pmm) method:
# 5 imputed datasets, 50 iterations, fixed seed for reproducibility.
tempData <- mice(Beers.with.Breweries,m=5,maxit=50,meth='pmm',seed=20)
##
## iter imp variable
## 1 1 ABV IBU
## 1 2 ABV IBU
## 1 3 ABV IBU
## 1 4 ABV IBU
## 1 5 ABV IBU
## 2 1 ABV IBU
## 2 2 ABV IBU
## 2 3 ABV IBU
## 2 4 ABV IBU
## 2 5 ABV IBU
## 3 1 ABV IBU
## 3 2 ABV IBU
## 3 3 ABV IBU
## 3 4 ABV IBU
## 3 5 ABV IBU
## 4 1 ABV IBU
## 4 2 ABV IBU
## 4 3 ABV IBU
## 4 4 ABV IBU
## 4 5 ABV IBU
## 5 1 ABV IBU
## 5 2 ABV IBU
## 5 3 ABV IBU
## 5 4 ABV IBU
## 5 5 ABV IBU
## 6 1 ABV IBU
## 6 2 ABV IBU
## 6 3 ABV IBU
## 6 4 ABV IBU
## 6 5 ABV IBU
## 7 1 ABV IBU
## 7 2 ABV IBU
## 7 3 ABV IBU
## 7 4 ABV IBU
## 7 5 ABV IBU
## 8 1 ABV IBU
## 8 2 ABV IBU
## 8 3 ABV IBU
## 8 4 ABV IBU
## 8 5 ABV IBU
## 9 1 ABV IBU
## 9 2 ABV IBU
## 9 3 ABV IBU
## 9 4 ABV IBU
## 9 5 ABV IBU
## 10 1 ABV IBU
## 10 2 ABV IBU
## 10 3 ABV IBU
## 10 4 ABV IBU
## 10 5 ABV IBU
## 11 1 ABV IBU
## 11 2 ABV IBU
## 11 3 ABV IBU
## 11 4 ABV IBU
## 11 5 ABV IBU
## 12 1 ABV IBU
## 12 2 ABV IBU
## 12 3 ABV IBU
## 12 4 ABV IBU
## 12 5 ABV IBU
## 13 1 ABV IBU
## 13 2 ABV IBU
## 13 3 ABV IBU
## 13 4 ABV IBU
## 13 5 ABV IBU
## 14 1 ABV IBU
## 14 2 ABV IBU
## 14 3 ABV IBU
## 14 4 ABV IBU
## 14 5 ABV IBU
## 15 1 ABV IBU
## 15 2 ABV IBU
## 15 3 ABV IBU
## 15 4 ABV IBU
## 15 5 ABV IBU
## 16 1 ABV IBU
## 16 2 ABV IBU
## 16 3 ABV IBU
## 16 4 ABV IBU
## 16 5 ABV IBU
## 17 1 ABV IBU
## 17 2 ABV IBU
## 17 3 ABV IBU
## 17 4 ABV IBU
## 17 5 ABV IBU
## 18 1 ABV IBU
## 18 2 ABV IBU
## 18 3 ABV IBU
## 18 4 ABV IBU
## 18 5 ABV IBU
## 19 1 ABV IBU
## 19 2 ABV IBU
## 19 3 ABV IBU
## 19 4 ABV IBU
## 19 5 ABV IBU
## 20 1 ABV IBU
## 20 2 ABV IBU
## 20 3 ABV IBU
## 20 4 ABV IBU
## 20 5 ABV IBU
## 21 1 ABV IBU
## 21 2 ABV IBU
## 21 3 ABV IBU
## 21 4 ABV IBU
## 21 5 ABV IBU
## 22 1 ABV IBU
## 22 2 ABV IBU
## 22 3 ABV IBU
## 22 4 ABV IBU
## 22 5 ABV IBU
## 23 1 ABV IBU
## 23 2 ABV IBU
## 23 3 ABV IBU
## 23 4 ABV IBU
## 23 5 ABV IBU
## 24 1 ABV IBU
## 24 2 ABV IBU
## 24 3 ABV IBU
## 24 4 ABV IBU
## 24 5 ABV IBU
## 25 1 ABV IBU
## 25 2 ABV IBU
## 25 3 ABV IBU
## 25 4 ABV IBU
## 25 5 ABV IBU
## 26 1 ABV IBU
## 26 2 ABV IBU
## 26 3 ABV IBU
## 26 4 ABV IBU
## 26 5 ABV IBU
## 27 1 ABV IBU
## 27 2 ABV IBU
## 27 3 ABV IBU
## 27 4 ABV IBU
## 27 5 ABV IBU
## 28 1 ABV IBU
## 28 2 ABV IBU
## 28 3 ABV IBU
## 28 4 ABV IBU
## 28 5 ABV IBU
## 29 1 ABV IBU
## 29 2 ABV IBU
## 29 3 ABV IBU
## 29 4 ABV IBU
## 29 5 ABV IBU
## 30 1 ABV IBU
## 30 2 ABV IBU
## 30 3 ABV IBU
## 30 4 ABV IBU
## 30 5 ABV IBU
## 31 1 ABV IBU
## 31 2 ABV IBU
## 31 3 ABV IBU
## 31 4 ABV IBU
## 31 5 ABV IBU
## 32 1 ABV IBU
## 32 2 ABV IBU
## 32 3 ABV IBU
## 32 4 ABV IBU
## 32 5 ABV IBU
## 33 1 ABV IBU
## 33 2 ABV IBU
## 33 3 ABV IBU
## 33 4 ABV IBU
## 33 5 ABV IBU
## 34 1 ABV IBU
## 34 2 ABV IBU
## 34 3 ABV IBU
## 34 4 ABV IBU
## 34 5 ABV IBU
## 35 1 ABV IBU
## 35 2 ABV IBU
## 35 3 ABV IBU
## 35 4 ABV IBU
## 35 5 ABV IBU
## 36 1 ABV IBU
## 36 2 ABV IBU
## 36 3 ABV IBU
## 36 4 ABV IBU
## 36 5 ABV IBU
## 37 1 ABV IBU
## 37 2 ABV IBU
## 37 3 ABV IBU
## 37 4 ABV IBU
## 37 5 ABV IBU
## 38 1 ABV IBU
## 38 2 ABV IBU
## 38 3 ABV IBU
## 38 4 ABV IBU
## 38 5 ABV IBU
## 39 1 ABV IBU
## 39 2 ABV IBU
## 39 3 ABV IBU
## 39 4 ABV IBU
## 39 5 ABV IBU
## 40 1 ABV IBU
## 40 2 ABV IBU
## 40 3 ABV IBU
## 40 4 ABV IBU
## 40 5 ABV IBU
## 41 1 ABV IBU
## 41 2 ABV IBU
## 41 3 ABV IBU
## 41 4 ABV IBU
## 41 5 ABV IBU
## 42 1 ABV IBU
## 42 2 ABV IBU
## 42 3 ABV IBU
## 42 4 ABV IBU
## 42 5 ABV IBU
## 43 1 ABV IBU
## 43 2 ABV IBU
## 43 3 ABV IBU
## 43 4 ABV IBU
## 43 5 ABV IBU
## 44 1 ABV IBU
## 44 2 ABV IBU
## 44 3 ABV IBU
## 44 4 ABV IBU
## 44 5 ABV IBU
## 45 1 ABV IBU
## 45 2 ABV IBU
## 45 3 ABV IBU
## 45 4 ABV IBU
## 45 5 ABV IBU
## 46 1 ABV IBU
## 46 2 ABV IBU
## 46 3 ABV IBU
## 46 4 ABV IBU
## 46 5 ABV IBU
## 47 1 ABV IBU
## 47 2 ABV IBU
## 47 3 ABV IBU
## 47 4 ABV IBU
## 47 5 ABV IBU
## 48 1 ABV IBU
## 48 2 ABV IBU
## 48 3 ABV IBU
## 48 4 ABV IBU
## 48 5 ABV IBU
## 49 1 ABV IBU
## 49 2 ABV IBU
## 49 3 ABV IBU
## 49 4 ABV IBU
## 49 5 ABV IBU
## 50 1 ABV IBU
## 50 2 ABV IBU
## 50 3 ABV IBU
## 50 4 ABV IBU
## 50 5 ABV IBU
## Warning: Number of logged events: 254
#summary(tempData)
# Completed dataset: take the first of the five imputed datasets
completedData <- complete(tempData,1)
#head(completedData)
# Density plot comparing the observed vs the imputed distributions of ABV and IBU
densityplot(tempData)
#Note: idea used above to impute data is from link below:
#https://datascienceplus.com/imputing-missing-data-with-r-mice-package/
#Compute and display the median of ABV and IBU by state.
# Renamed the result from `median` to `median_by_state` so the data frame
# does not shadow base::median() (all uses are updated below).
median_by_state <- completedData %>% group_by(State) %>%
  summarize(median_ABV = median(ABV), median_IBU = median(IBU))
median_by_state
## # A tibble: 51 x 3
## State median_ABV median_IBU
## <fct> <dbl> <dbl>
## 1 " AK" 0.056 35
## 2 " AL" 0.06 39.5
## 3 " AR" 0.052 72
## 4 " AZ" 0.055 28
## 5 " CA" 0.058 36
## 6 " CO" 0.061 35
## 7 " CT" 0.06 32
## 8 " DC" 0.0625 75
## 9 " DE" 0.0705 52
## 10 " FL" 0.0555 36.5
## # … with 41 more rows
#Draw bar charts to compare imputed vs original data.
#First, median alcohol content using the modified (imputed) dataset
median_by_state %>% ggplot() +
  geom_bar(mapping = aes(x = reorder(State, -median_ABV), y = median_ABV, fill = State), stat = "identity") +
  ggtitle("Median Alcohol content by State on modified dataset") + xlab("State") + ylab("Alcohol Content Percentage")
#Below: same plot on the original dataset with missing data; states whose
#median is NA (because of missing values) are dropped from the chart
Beers.with.Breweries %>% group_by(State) %>%
  summarize(median_ABV = median(ABV), median_IBU = median(IBU)) %>% ggplot() +
  geom_bar(mapping = aes(x = reorder(State, -median_ABV), y = median_ABV, fill = State), stat = "identity") +
  ggtitle("Median Alcohol content by State on non-Modified dataset") + xlab("State") + ylab("Alcohol Content Percentage")
#Median International Bitterness Unit for each state on the imputed dataset
median_by_state %>% ggplot() +
  geom_bar(mapping = aes(x = reorder(State, -median_IBU), y = median_IBU, fill = State), stat = "identity") +
  ggtitle("Median International Bitterness Unit by State on modified dataset") + xlab("State") + ylab("International Bitterness Unit")
#Same plot on the original dataset with missing data
Beers.with.Breweries %>% group_by(State) %>%
  summarize(median_ABV = median(ABV), median_IBU = median(IBU)) %>% ggplot() +
  geom_bar(mapping = aes(x = reorder(State, -median_IBU), y = median_IBU, fill = State), stat = "identity") +
  ggtitle("Median International Bitterness Unit by State on non-modified dataset") + xlab("State") + ylab("International Bitterness Unit")
## `summarise()` ungrouping output (override with `.groups` argument)
## Warning: Removed 47 rows containing missing values (position_stack).
# Top 6 most alcoholic beers (imputed dataset) and the states that make them
completedData %>%
  arrange(desc(ABV)) %>%
  select(State, ABV, Beer_Name) %>%
  head()
## State ABV Beer_Name
## 1 CO 0.128 Lee Hill Series Vol. 5 - Belgian Style Quadrupel Ale
## 2 KY 0.125 London Balling
## 3 IN 0.120 Csar
## 4 CO 0.104 Lee Hill Series Vol. 4 - Manhattan Style Rye Ale
## 5 NY 0.100 4Beans
## 6 CA 0.099 Lower De Boom
# Same ranking on the un-imputed dataset (arrange() sorts NA last,
# so the top of the list is unchanged -- see matching output below)
Beers.with.Breweries %>%
  arrange(desc(ABV)) %>%
  select(State, ABV, Beer_Name) %>%
  head()
## State ABV Beer_Name
## 1 CO 0.128 Lee Hill Series Vol. 5 - Belgian Style Quadrupel Ale
## 2 KY 0.125 London Balling
## 3 IN 0.120 Csar
## 4 CO 0.104 Lee Hill Series Vol. 4 - Manhattan Style Rye Ale
## 5 NY 0.100 4Beans
## 6 CA 0.099 Lower De Boom
# Top 6 most bitter (highest IBU) beers on the imputed dataset
completedData %>%
  arrange(desc(IBU)) %>%
  select(State, IBU, Beer_Name) %>%
  head()
## State IBU Beer_Name
## 1 OR 138 Bitter Bitch Imperial IPA
## 2 MI 138 Zaison
## 3 MI 138 Lemon Shandy Tripel
## 4 VA 135 Troopers Alley IPA
## 5 MI 130 Escoffier Bretta Ale
## 6 MA 130 Dead-Eye DIPA
# Same bitterness ranking on the un-imputed dataset for comparison
Beers.with.Breweries %>%
  arrange(desc(IBU)) %>%
  select(State, IBU, Beer_Name) %>%
  head()
## State IBU Beer_Name
## 1 OR 138 Bitter Bitch Imperial IPA
## 2 VA 135 Troopers Alley IPA
## 3 MA 130 Dead-Eye DIPA
## 4 OH 126 Bay of Bengal Double IPA (2014)
## 5 MN 120 Abrasive Ale
## 6 VT 120 Heady Topper
# Relationship between bitterness and alcohol content, imputed data first
completedData %>%
  select(ABV, IBU, State) %>%
  ggplot(aes(x = ABV, y = IBU)) +
  geom_point(position = "jitter") +
  ggtitle("IBU vs. ABV -- modified data")
# Same relationship on the original data (rows with missing IBU are dropped)
Beers.with.Breweries %>%
  select(ABV, IBU, State) %>%
  ggplot(aes(x = ABV, y = IBU)) +
  geom_point(position = "jitter") +
  ggtitle("IBU vs. ABV -- unmodified data")
## Warning: Removed 1005 rows containing missing values (geom_point).
# In order to investigate the IBU/ABV difference between India Pale Ales and
# other ales, first extract every beer name containing "Ale" (case-insensitive).
# Use bare column names inside filter() instead of completedData$... /
# all_ales$... so the condition always refers to the piped data frame.
all_ales = completedData %>%
  filter(str_detect(Beer_Name, regex("Ale", ignore.case = TRUE)))
india_pale_ales = all_ales %>%
  filter(str_detect(Beer_Name, regex("India Pale Ale", ignore.case = TRUE)))
other_ales = all_ales %>%
  filter(!str_detect(Beer_Name, regex("India Pale Ale", ignore.case = TRUE)))
# To use the KNN model effectively, put ABV and IBU on roughly the same
# numeric scale so neither feature dominates the distance: multiply each
# ABV value by 1000.
all_ales$ABV = all_ales$ABV * 1000
# Label each ale "India Pale Ale" or "Other Ale" so we can test whether IBU
# and ABV alone can distinguish IPAs from the rest.
# NOTE(review): names abbreviated as "IPA" do not match this pattern and end
# up labeled "Other Ale" -- confirm that is intended.
all_ales = all_ales %>%
  mutate(India.Pale.Ale.Or.Else = ifelse(str_detect(Beer_Name, regex("India Pale Ale", ignore.case = TRUE)), "India Pale Ale", "Other Ale"))
#all_ales$India.Pale.Ale.Or.Else
# Start KNN test to see how good it is to use ABV and IBU to distinguish the Ales
library(class)
library(caret)
## Loading required package: lattice
##
## Attaching package: 'caret'
## The following object is masked from 'package:purrr':
##
## lift
library(e1071)
all_ales$India.Pale.Ale.Or.Else = as.factor(all_ales$India.Pale.Ale.Or.Else)
# Keep the result reproducible (initially tried set.seed(100)); try k = 5
set.seed(101)
splitPerc = 0.8
# seq_len(nrow(...)) is safer than 1:dim(...)[1] (correct even for 0 rows)
trainIndices = sample(seq_len(nrow(all_ales)), round(splitPerc * nrow(all_ales)))
train = all_ales[trainIndices,]
test = all_ales[-trainIndices,]
# Select the predictors (ABV, IBU) by name rather than position (3:4) so the
# model does not silently break if the column order ever changes.
classifications = knn(train[, c("ABV", "IBU")], test[, c("ABV", "IBU")],
                      train$India.Pale.Ale.Or.Else, prob = TRUE, k = 5)
table(classifications, test$India.Pale.Ale.Or.Else)
##
## classifications India Pale Ale Other Ale
## India Pale Ale 2 2
## Other Ale 4 116
# Full confusion-matrix statistics for the k = 5 model
cm = confusionMatrix(table(classifications,test$India.Pale.Ale.Or.Else))
cm
## Confusion Matrix and Statistics
##
##
## classifications India Pale Ale Other Ale
## India Pale Ale 2 2
## Other Ale 4 116
##
## Accuracy : 0.9516
## 95% CI : (0.8977, 0.982)
## No Information Rate : 0.9516
## P-Value [Acc > NIR] : 0.6063
##
## Kappa : 0.3758
##
## Mcnemar's Test P-Value : 0.6831
##
## Sensitivity : 0.33333
## Specificity : 0.98305
## Pos Pred Value : 0.50000
## Neg Pred Value : 0.96667
## Prevalence : 0.04839
## Detection Rate : 0.01613
## Detection Prevalence : 0.03226
## Balanced Accuracy : 0.65819
##
## 'Positive' Class : India Pale Ale
##
# Explore which k gives the best mean accuracy over many random splits
set.seed(101)
iterations = 500
numks = 50
masterAcc = matrix(nrow = iterations, ncol = numks)
for (j in seq_len(iterations))
{
  trainIndices = sample(seq_len(nrow(all_ales)), round(splitPerc * nrow(all_ales)))
  train = all_ales[trainIndices,]
  test = all_ales[-trainIndices,]
  for (i in seq_len(numks))
  {
    # Predictors selected by name; the original's unassigned table() call
    # inside the loop was dead code and has been removed.
    classifications = knn(train[, c("ABV", "IBU")], test[, c("ABV", "IBU")],
                          train$India.Pale.Ale.Or.Else, prob = TRUE, k = i)
    CM = confusionMatrix(table(classifications, test$India.Pale.Ale.Or.Else))
    masterAcc[j, i] = CM$overall[1]
  }
}
MeanAcc = colMeans(masterAcc)
plot(seq(1, numks, 1), MeanAcc, type = "l",
     main = "mean Accuracy vs. different K (number of neighbor used to predict)",
     ylab = "Mean Accuracy", xlab = "k used")
# Try k = 30 after the sweep suggested it might give the best accuracy
# (reproducible: same seed as the earlier run, set.seed(101))
set.seed(101)
splitPerc = 0.8
trainIndices = sample(seq_len(nrow(all_ales)), round(splitPerc * nrow(all_ales)))
train = all_ales[trainIndices,]
test = all_ales[-trainIndices,]
# k = 30 (the original "# try k=5" comment was stale)
classifications = knn(train[, c("ABV", "IBU")], test[, c("ABV", "IBU")],
                      train$India.Pale.Ale.Or.Else, prob = TRUE, k = 30)
table(classifications, test$India.Pale.Ale.Or.Else)
##
## classifications India Pale Ale Other Ale
## India Pale Ale 0 0
## Other Ale 6 118
# Confusion-matrix statistics for k = 30 (note: sensitivity collapses to 0 --
# the model predicts every ale as "Other Ale")
cm = confusionMatrix(table(classifications,test$India.Pale.Ale.Or.Else))
cm
## Confusion Matrix and Statistics
##
##
## classifications India Pale Ale Other Ale
## India Pale Ale 0 0
## Other Ale 6 118
##
## Accuracy : 0.9516
## 95% CI : (0.8977, 0.982)
## No Information Rate : 0.9516
## P-Value [Acc > NIR] : 0.60634
##
## Kappa : 0
##
## Mcnemar's Test P-Value : 0.04123
##
## Sensitivity : 0.00000
## Specificity : 1.00000
## Pos Pred Value : NaN
## Neg Pred Value : 0.95161
## Prevalence : 0.04839
## Detection Rate : 0.00000
## Detection Prevalence : 0.00000
## Balanced Accuracy : 0.50000
##
## 'Positive' Class : India Pale Ale
##
# Explore which k gives the best mean sensitivity over many random splits
set.seed(101)
iterations = 500
numks = 50
masterSen = matrix(nrow = iterations, ncol = numks)
for (j in seq_len(iterations))
{
  trainIndices = sample(seq_len(nrow(all_ales)), round(splitPerc * nrow(all_ales)))
  train = all_ales[trainIndices,]
  test = all_ales[-trainIndices,]
  for (i in seq_len(numks))
  {
    # Predictors selected by name; the original's unassigned table() call
    # inside the loop was dead code and has been removed.
    classifications = knn(train[, c("ABV", "IBU")], test[, c("ABV", "IBU")],
                          train$India.Pale.Ale.Or.Else, prob = TRUE, k = i)
    CM = confusionMatrix(table(classifications, test$India.Pale.Ale.Or.Else))
    masterSen[j, i] = CM$byClass[1]
  }
}
MeanSen = colMeans(masterSen)
plot(seq(1, numks, 1), MeanSen, type = "l",
     main = "mean Sensitivity vs. different K (number of neighbor used to predict)",
     ylab = "Mean Sensitivity", xlab = "k used")
# Try k = 3 after the sweep suggested it might give the best sensitivity
# (reproducible: same seed as the earlier runs, set.seed(101))
set.seed(101)
splitPerc = 0.8
trainIndices = sample(seq_len(nrow(all_ales)), round(splitPerc * nrow(all_ales)))
train = all_ales[trainIndices,]
test = all_ales[-trainIndices,]
# k = 3 (the original "# try k=5" comment was stale)
classifications = knn(train[, c("ABV", "IBU")], test[, c("ABV", "IBU")],
                      train$India.Pale.Ale.Or.Else, prob = TRUE, k = 3)
table(classifications, test$India.Pale.Ale.Or.Else)
##
## classifications India Pale Ale Other Ale
## India Pale Ale 2 6
## Other Ale 4 112
# Confusion-matrix statistics for the k = 3 model
cm = confusionMatrix(table(classifications,test$India.Pale.Ale.Or.Else))
cm
## Confusion Matrix and Statistics
##
##
## classifications India Pale Ale Other Ale
## India Pale Ale 2 6
## Other Ale 4 112
##
## Accuracy : 0.9194
## 95% CI : (0.8567, 0.9606)
## No Information Rate : 0.9516
## P-Value [Acc > NIR] : 0.9614
##
## Kappa : 0.2439
##
## Mcnemar's Test P-Value : 0.7518
##
## Sensitivity : 0.33333
## Specificity : 0.94915
## Pos Pred Value : 0.25000
## Neg Pred Value : 0.96552
## Prevalence : 0.04839
## Detection Rate : 0.01613
## Detection Prevalence : 0.06452
## Balanced Accuracy : 0.64124
##
## 'Positive' Class : India Pale Ale
##
# Average Accuracy, Sensitivity, and Specificity of the KNN model (k = 3)
# over 100 random 0.8 train/test splits.
# (The unused `numks = 50` assignment from the original was removed; it had
# no effect here since k is fixed at 3.)
set.seed(101)
iterations = 100
masterAcc = matrix(nrow = iterations, ncol = 1)
masterSen = matrix(nrow = iterations, ncol = 1)
masterSpec = matrix(nrow = iterations, ncol = 1)
for (j in seq_len(iterations))
{
  trainIndices = sample(seq_len(nrow(all_ales)), round(splitPerc * nrow(all_ales)))
  train = all_ales[trainIndices,]
  test = all_ales[-trainIndices,]
  classifications = knn(train[, c("ABV", "IBU")], test[, c("ABV", "IBU")],
                        train$India.Pale.Ale.Or.Else, prob = TRUE, k = 3)
  CM = confusionMatrix(table(classifications, test$India.Pale.Ale.Or.Else))
  masterAcc[j, 1] = CM$overall[1]
  masterSen[j, 1] = CM$byClass[1]
  masterSpec[j, 1] = CM$byClass[2]
}
MeanAcc = colMeans(masterAcc)
MeanSen = colMeans(masterSen)
MeanSpec = colMeans(masterSpec)
MeanAcc
## [1] 0.9328226
MeanSen
## [1] 0.2558146
MeanSpec
## [1] 0.9819341
# Try a Naive Bayes model to see if it improves the results, 0.8 train split
set.seed(102)
splitPerc = 0.8
trainIndices = sample(seq_len(nrow(all_ales)), round(splitPerc * nrow(all_ales)))
train = all_ales[trainIndices,]
test = all_ales[-trainIndices,]
# NB model on the two numeric predictors, selected by name (was positional 3:4)
model = naiveBayes(train[, c("ABV", "IBU")], train$India.Pale.Ale.Or.Else)
CM = confusionMatrix(table(predict(model, test[, c("ABV", "IBU")]), test$India.Pale.Ale.Or.Else))
CM
## Confusion Matrix and Statistics
##
##
## India Pale Ale Other Ale
## India Pale Ale 4 2
## Other Ale 7 111
##
## Accuracy : 0.9274
## 95% CI : (0.8667, 0.9663)
## No Information Rate : 0.9113
## P-Value [Acc > NIR] : 0.3304
##
## Kappa : 0.4352
##
## Mcnemar's Test P-Value : 0.1824
##
## Sensitivity : 0.36364
## Specificity : 0.98230
## Pos Pred Value : 0.66667
## Neg Pred Value : 0.94068
## Prevalence : 0.08871
## Detection Rate : 0.03226
## Detection Prevalence : 0.04839
## Balanced Accuracy : 0.67297
##
## 'Positive' Class : India Pale Ale
##
# Average Accuracy, Sensitivity and Specificity of the NB model over 100
# random 0.7 train/test splits
set.seed(101)
splitPerc = .7
iterations = 100
masterAcc = matrix(nrow = iterations, ncol = 1)
masterSen = matrix(nrow = iterations, ncol = 1)
masterSpec = matrix(nrow = iterations, ncol = 1)
for (j in seq_len(iterations))
{
  trainIndices = sample(seq_len(nrow(all_ales)), round(splitPerc * nrow(all_ales)))
  train = all_ales[trainIndices,]
  test = all_ales[-trainIndices,]
  # NB model (predictors selected by name, not position)
  model = naiveBayes(train[, c("ABV", "IBU")], train$India.Pale.Ale.Or.Else)
  CM = confusionMatrix(table(predict(model, test[, c("ABV", "IBU")]), test$India.Pale.Ale.Or.Else))
  masterAcc[j, 1] = CM$overall[1]
  masterSen[j, 1] = CM$byClass[1]
  masterSpec[j, 1] = CM$byClass[2]
}
MeanAcc = colMeans(masterAcc)
MeanSen = colMeans(masterSen)
MeanSpec = colMeans(masterSpec)
MeanAcc
## [1] 0.9011892
MeanSen
## [1] 0.1544105
MeanSpec
## [1] 0.9545286
#import map library
library(maps)
##
## Attaching package: 'maps'
## The following object is masked from 'package:purrr':
##
## map
library(plotly)
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
# Import US cities location data. Relative path (keep uscities.csv next to
# this RMD file) instead of a machine-specific absolute path.
uscities <- read.csv("uscities.csv", header = TRUE)
uscities = uscities %>% rename(City = city)
# Keep one row per city name.
# NOTE(review): deduplicating on City alone discards same-named cities in
# different states (e.g. Portland OR vs Portland ME), so some joined
# coordinates may belong to the wrong state -- consider keying on
# City + state_id as well.
uscities = uscities %>% group_by(City) %>% filter(row_number() == 1)
# Get all Ale beers available in the imputed dataset and attach coordinates
allAles2 = completedData %>% filter(str_detect(Beer_Name, regex("Ale", ignore.case = TRUE)))
dataWithMap = left_join(allAles2, uscities, by = "City")
str(dataWithMap)
## 'data.frame': 618 obs. of 26 variables:
## $ Beer_Name : chr "British Pale Ale (2010)" "British Pale Ale" "Klickitat Pale Ale" "Dusty Trail Pale Ale" ...
## $ Beer_ID : int 866 48 2402 1474 2592 1818 1326 811 78 76 ...
## $ ABV : num 0.054 0.054 0.053 0.054 0.059 0.042 0.058 0.058 0.058 0.055 ...
## $ IBU : int 30 30 36 55 24 15 15 15 15 28 ...
## $ Brew_ID : int 482 482 118 402 36 172 172 172 172 172 ...
## $ Style : chr "English Pale Ale" "English Pale Ale" "American Pale Ale (APA)" "American Pale Ale (APA)" ...
## $ Ounces : num 16 16 12 16 12 12 12 12 12 12 ...
## $ Brew_Name : chr "7 Seas Brewing Company" "7 Seas Brewing Company" "Alameda Brewing" "Amnesia Brewing Company" ...
## $ City : chr "Gig Harbor" "Gig Harbor" "Portland" "Washougal" ...
## $ State : Factor w/ 51 levels " AK"," AL"," AR",..: 48 48 38 48 5 5 5 5 5 5 ...
## $ city_ascii : chr "Gig Harbor" "Gig Harbor" "Portland" "Washougal" ...
## $ state_id : chr "WA" "WA" "OR" "WA" ...
## $ state_name : chr "Washington" "Washington" "Oregon" "Washington" ...
## $ county_fips : int 53053 53053 41051 53011 6075 29053 29053 29053 29053 29053 ...
## $ county_name : chr "Pierce" "Pierce" "Multnomah" "Clark" ...
## $ lat : num 47.3 47.3 45.5 45.6 37.8 ...
## $ lng : num -123 -123 -123 -122 -122 ...
## $ population : num 10717 10717 2074775 16107 3592294 ...
## $ density : num 701 701 1894 1073 7256 ...
## $ source : chr "polygon" "polygon" "polygon" "polygon" ...
## $ military : logi FALSE FALSE FALSE FALSE FALSE FALSE ...
## $ incorporated: logi TRUE TRUE TRUE TRUE TRUE TRUE ...
## $ timezone : chr "America/Los_Angeles" "America/Los_Angeles" "America/Los_Angeles" "America/Los_Angeles" ...
## $ ranking : int 3 3 1 3 1 3 3 3 3 3 ...
## $ zips : chr "98332 98335" "98332 98335" "97227 97221 97220 97229 97203 97202 97201 97206 97205 97204 97209 97208 97266 97218 97236 97232 97233 97230 972"| __truncated__ "98671" ...
## $ id : int 1840019855 1840019855 1840019941 1840021190 1840021543 1840007420 1840007420 1840007420 1840007420 1840007420 ...
#dataWithMap %>% filter((is.na(lng))|(is.na(lat))) %>% select(Brew_Name,City)
# Keep only the columns needed for plotting the map
dataWithMap2 = dataWithMap %>%
  select(Beer_Name, City, lat, lng, state_name)
head(dataWithMap2, 100)
## Beer_Name City lat
## 1 British Pale Ale (2010) Gig Harbor 47.3353
## 2 British Pale Ale Gig Harbor 47.3353
## 3 Klickitat Pale Ale Portland 45.5372
## 4 Dusty Trail Pale Ale Washougal 45.5822
## 5 Liberty Ale San Francisco 37.7562
## 6 Keebarlin' Pale Ale Boonville 38.9588
## 7 Boont Amber Ale Boonville 38.9588
## 8 Boont Amber Ale (2010) Boonville 38.9588
## 9 Boont Amber Ale (2011) Boonville 38.9588
## 10 Poleeko Gold Pale Ale (2009) Boonville 38.9588
## 11 River Pig Pale Ale Hayward 37.6328
## 12 On-On Ale (2008) Colorado Springs 38.8674
## 13 Greenbelt Farmhouse Ale Denton 33.2176
## 14 Green Bullet Organic India Pale Ale Boulder 40.0249
## 15 Independence Pass Ale Aspen 39.1950
## 16 Old Red Beard Amber Ale Astoria 46.1856
## 17 Dirty Blonde Ale Detroit 42.3834
## 18 Avery India Pale Ale Boulder 40.0249
## 19 Ellie’s Brown Ale Boulder 40.0249
## 20 Hog Wild India Pale Ale Fuquay-Varina 35.5958
## 21 Back East Golden Ale Bloomfield 36.7401
## 22 Back East Ale Bloomfield 36.7401
## 23 Truck Stop Honey Brown Ale Gadsden 34.0090
## 24 Naked Pig Pale Ale Gadsden 34.0090
## 25 Topcutter India Pale Ale Yakima 46.5923
## 26 Field 41 Pale Ale Yakima 46.5923
## 27 Ballast Point Pale Ale San Diego 32.8312
## 28 Big Eye India Pale Ale San Diego 32.8312
## 29 All Nighter Ale Williamsburg 37.2692
## 30 Banner American Ale Williamsburg 37.2692
## 31 Hayride Autumn Ale Lewiston 44.0915
## 32 Celsius Summer Ale (2012) Lewiston 44.0915
## 33 Pamola Xtra Pale Ale Lewiston 44.0915
## 34 Watermelon Ale Lowell 42.6389
## 35 Fenway American Pale Ale Lowell 42.6389
## 36 Bunker Hill Blueberry Ale Lowell 42.6389
## 37 Bent Paddle Black Ale Duluth 46.7756
## 38 Steel Rail Extra Pale Ale South Deerfield 42.4795
## 39 West Portal Colorado Common Summer Ale Broomfield 39.9541
## 40 413 Farmhouse Ale Sheffield 34.7570
## 41 Scape Goat Pale Ale Missoula 46.8750
## 42 Montana Trout Slayer Ale Missoula 46.8750
## 43 Moose Drool Brown Ale Missoula 46.8750
## 44 Powder Hound Winter Ale Missoula 46.8750
## 45 Moose Drool Brown Ale (2011) Missoula 46.8750
## 46 Montana Trout Slayer Ale (2012) Missoula 46.8750
## 47 Scape Goat Pale Ale (2010) Missoula 46.8750
## 48 Montana Trout Slayer Ale (2009) Missoula 46.8750
## 49 Moose Drool Brown Ale (2009) Missoula 46.8750
## 50 Jalapeno Pale Ale Charlotte 35.2080
## 51 Single Hop Ale Hamilton 39.3938
## 52 Sawtooth Ale Hamilton 39.3938
## 53 Aftermath Pale Ale Temecula 33.4928
## 54 American India Red Ale Denver 39.7621
## 55 Colorado Red Ale Denver 39.7621
## 56 Saddle Bronc Brown Ale Sheridan 44.7962
## 57 Bomber Mountain Amber Ale Sheridan 44.7962
## 58 Coconut Brown Ale Marquette 46.5440
## 59 Brewerhood Brown Ale Lincoln 40.8090
## 60 Last Call Imperial Amber Ale Lincoln 40.8090
## 61 543 Skull Creek Fresh Hopped Pale Ale Lincoln 40.8090
## 62 834 Happy As Ale Lincoln 40.8090
## 63 Full Nelson Pale Ale Afton 44.9042
## 64 Full Nelson Pale Ale (2010) Afton 44.9042
## 65 Blue Point Summer Ale Patchogue 40.7621
## 66 Killer Whale Cream Ale Jacksonville 30.3322
## 67 Duke's Cold Nose Brown Ale Jacksonville 30.3322
## 68 Firestarter India Pale Ale Eagle 43.7223
## 69 Kilt Dropper Scotch Ale Eagle 43.7223
## 70 Farmer Wirtz India Pale Ale Eagle 43.7223
## 71 Slow & Steady Golden Ale Eagle 43.7223
## 72 Moe's Original Bar B Que 'Bama Brew Golden Ale Eagle 43.7223
## 73 Live Local Golden Ale Eagle 43.7223
## 74 Screaming Eagle Special Ale ESB Eagle 43.7223
## 75 Kindler Pale Ale Eagle 43.7223
## 76 Awry Rye Pale Ale Eagle 43.7223
## 77 Demshitz Brown Ale Eagle 43.7223
## 78 Firestarter India Pale Ale Eagle 43.7223
## 79 Samuel Adams Summer Ale Boston 42.3188
## 80 Hoopla Pale Ale Boulder 40.0249
## 81 Bozone Select Amber Ale Bozeman 45.6832
## 82 SummerBright Ale Denver 39.7621
## 83 Avalanche Ale Denver 39.7621
## 84 Rollin Dirty Red Ale Tampa 27.9942
## 85 Le Flaneur Ale Grand Rapids 42.9620
## 86 Hubris Quadrupel Anniversary Ale Grand Rapids 42.9620
## 87 Escoffier Bretta Ale Grand Rapids 42.9620
## 88 Tampa Pale Ale Tampa Bay NA
## 89 Orange Grove Wheat Ale Tampa Bay NA
## 90 Broad Brook Ale East Windsor NA
## 91 Northern Lights Amber Ale Anchorage 61.1508
## 92 Polar Pale Ale Anchorage 61.1508
## 93 Chugach Session Ale Anchorage 61.1508
## 94 East India Pale Ale Brooklyn 40.6501
## 95 Brooklyn Summer Ale Brooklyn 40.6501
## 96 East India Pale Ale Brooklyn 40.6501
## 97 Brooklyn Summer Ale (2011) Brooklyn 40.6501
## 98 Tule Duck Red Ale (Current) Reno 39.5497
## 99 Original Orange Blossom Ale (Current) Reno 39.5497
## 100 Pale Alement Michigan City 41.7099
## lng state_name
## 1 -122.5964 Washington
## 2 -122.5964 Washington
## 3 -122.6500 Oregon
## 4 -122.3448 Washington
## 5 -122.4430 California
## 6 -92.7471 Missouri
## 7 -92.7471 Missouri
## 8 -92.7471 Missouri
## 9 -92.7471 Missouri
## 10 -92.7471 Missouri
## 11 -122.0772 California
## 12 -104.7606 Colorado
## 13 -97.1419 Texas
## 14 -105.2523 Colorado
## 15 -106.8369 Colorado
## 16 -123.8053 Oregon
## 17 -83.1024 Michigan
## 18 -105.2523 Colorado
## 19 -105.2523 Colorado
## 20 -78.7794 North Carolina
## 21 -107.9734 New Mexico
## 22 -107.9734 New Mexico
## 23 -86.0156 Alabama
## 24 -86.0156 Alabama
## 25 -120.5496 Washington
## 26 -120.5496 Washington
## 27 -117.1225 California
## 28 -117.1225 California
## 29 -76.7076 Virginia
## 30 -76.7076 Virginia
## 31 -70.1681 Maine
## 32 -70.1681 Maine
## 33 -70.1681 Maine
## 34 -71.3217 Massachusetts
## 35 -71.3217 Massachusetts
## 36 -71.3217 Massachusetts
## 37 -92.1392 Minnesota
## 38 -72.5947 Massachusetts
## 39 -105.0527 Colorado
## 40 -87.6977 Alabama
## 41 -114.0214 Montana
## 42 -114.0214 Montana
## 43 -114.0214 Montana
## 44 -114.0214 Montana
## 45 -114.0214 Montana
## 46 -114.0214 Montana
## 47 -114.0214 Montana
## 48 -114.0214 Montana
## 49 -114.0214 Montana
## 50 -80.8304 North Carolina
## 51 -84.5653 Ohio
## 52 -84.5653 Ohio
## 53 -117.1315 California
## 54 -104.8759 Colorado
## 55 -104.8759 Colorado
## 56 -106.9643 Wyoming
## 57 -106.9643 Wyoming
## 58 -87.4082 Michigan
## 59 -96.6788 Nebraska
## 60 -96.6788 Nebraska
## 61 -96.6788 Nebraska
## 62 -96.6788 Nebraska
## 63 -92.8174 Minnesota
## 64 -92.8174 Minnesota
## 65 -73.0185 New York
## 66 -81.6749 Florida
## 67 -81.6749 Florida
## 68 -116.3862 Idaho
## 69 -116.3862 Idaho
## 70 -116.3862 Idaho
## 71 -116.3862 Idaho
## 72 -116.3862 Idaho
## 73 -116.3862 Idaho
## 74 -116.3862 Idaho
## 75 -116.3862 Idaho
## 76 -116.3862 Idaho
## 77 -116.3862 Idaho
## 78 -116.3862 Idaho
## 79 -71.0846 Massachusetts
## 80 -105.2523 Colorado
## 81 -111.0552 Montana
## 82 -104.8759 Colorado
## 83 -104.8759 Colorado
## 84 -82.4451 Florida
## 85 -85.6562 Michigan
## 86 -85.6562 Michigan
## 87 -85.6562 Michigan
## 88 NA <NA>
## 89 NA <NA>
## 90 NA <NA>
## 91 -149.1091 Alaska
## 92 -149.1091 Alaska
## 93 -149.1091 Alaska
## 94 -73.9496 New York
## 95 -73.9496 New York
## 96 -73.9496 New York
## 97 -73.9496 New York
## 98 -119.8483 Nevada
## 99 -119.8483 Nevada
## 100 -86.8705 Indiana
# Count how many ales each city produces, keep one row per city, and drop
# cities whose coordinates were not matched in uscities.csv
dataWithMap3 = dataWithMap2 %>% group_by(City) %>% mutate(count = n())
dataWithMap3 = dataWithMap3 %>% group_by(City) %>% filter(row_number() == 1) %>%
  filter((!is.na(lng)) & (!is.na(lat)))
states <- map_data("state")
# Base map of the contiguous US states
p <- ggplot() +
  geom_polygon(data = states, aes(x = long, y = lat, group = group), fill = "yellow", color = "black") +
  coord_quickmap()
# Overlay one point per city; size and alpha encode how many ales it produces.
# (Fixed the "Latitute" typo in the y-axis label.)
p <- p + geom_point(data = dataWithMap3, aes(x = lng, y = lat, size = count, alpha = count), color = "blue") +
  ggtitle("What cities produce many kinds of Ales") + xlab("Longitude") + ylab("Latitude")
ggplotly(p)
## Warning: `group_by_()` is deprecated as of dplyr 0.7.0.
## Please use `group_by()` instead.
## See vignette('programming') for more help
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_warnings()` to see where this warning was generated.
Comment on the summary statistics and distribution of the ABV variable.